# Exploratory Data Analysis ----

# interactive trends plot: start from an empty plotly line chart and add one
# trace per actor further below
p <- plot_ly(type = "scatter", mode = "lines")

# Clean the raw Google Trends export: "<1" means "less than 1" (recode to 0),
# coerce the actor columns to numeric, reshape to long format, parse Month.
trends <- trends %>%
  mutate(across(everything(), function(col) ifelse(col == "<1", 0, col))) %>%
  mutate(across("Mark Ruffalo":"Aaron Eckhart", as.numeric)) %>%
  pivot_longer(cols = "Mark Ruffalo":"Aaron Eckhart") %>%
  mutate(Month = ym(Month))

# Add one line trace per actor. purrr::reduce() threads the growing plot
# through each per-actor data frame, which avoids the previous pattern of
# assigning into the global `p` with `<<-` from inside walk().
p <- trends %>%
  group_by(name) %>%
  group_split() %>%
  purrr::reduce(
    .init = p,
    function(plot, df) {
      add_trace(plot,
                data = df,
                x = ~Month,
                y = ~value,
                name = unique(df$name),
                text = ~paste(name, value),
                hoverinfo = "text",
                # traces start hidden; the user toggles them via the legend
                visible = "legendonly")
    }
  )

# Render the assembled plot; axis ranges are pinned to the observed window.
x_axis <- list(title = "Month",
               range = c("2004-01-01", "2025-03-01"))
y_axis <- list(title = "Google Trend Search Interest",
               range = c(0, 100))

p %>%
  layout(
    title = "Monthly Search Interest for Selected Actors since 2004",
    xaxis = x_axis,
    yaxis = y_axis,
    legend = list(title = list(text = "Select an Actor"))
  )
# joining: lower-case the actor names so the trends and actors keys line up
joined <- trends %>%
  mutate(name = tolower(name)) %>%
  left_join(actors, by = "name")
 
# by winner: mean search interest per month within each nominee type.
# `.groups = "drop"` makes the result grouping explicit and removes the
# "`summarise()` has grouped output" message the old code emitted.
joined %>%
  filter(!is.na(type)) %>%
  group_by(Month, type) %>%
  summarise(value = mean(value), .groups = "drop") %>%
  ggplot(aes(x = Month, y = value)) +
  geom_line() +
  labs(title = "Google Trendlines for all Oscar Nominated Actors",
       subtitle = "Nominated 2004-Present") +
  facet_wrap(vars(type))

# This is pretty interesting - it suggests that Oscar-nominated actors have generally been searched more, but that this has leveled off (which could reflect trends in Google Search overall). The cyclical pattern is also interesting and I'm guessing it coincides with the ceremony and nomination announcements!

# Which calendar months draw the most search interest, per nominee type?
# month() extracts 1-12, so mean_value averages each calendar month across
# all years.
joined %>% mutate(month = month(Month)) %>%
  filter(!is.na(type)) %>%
  group_by(type, month) %>%
  summarise(mean_value = mean(value)) %>%
  # ensure month is not a grouping column so slice_max picks top 3 per type
  ungroup(month) %>%
  slice_max(order_by = mean_value, n = 3)
## `summarise()` has grouped output by 'type'. You can override using the
## `.groups` argument.
## # A tibble: 9 × 3
## # Groups:   type [3]
##   type       month mean_value
##   <chr>      <dbl>      <dbl>
## 1 both           1       14.5
## 2 both           2       13.4
## 3 both          12       13.1
## 4 commercial     1       13.1
## 5 commercial     7       13.1
## 6 commercial    12       12.7
## 7 oscar          1       12.0
## 8 oscar          2       11.6
## 9 oscar          3       11.0
# Reshape to one row per actor, one column per month; actor names become
# rownames so they survive the matrix conversion below.
wide <- trends %>%
  pivot_wider(names_from = Month, values_from = value) %>%
  tibble::column_to_rownames("name")
# Numeric matrix input for tsclust(); rownames carry the actor names.
ts_matrix <- as.matrix(wide)

# Partition the actor time series into 5 clusters using shape-based distance.
clusters <- tsclust(
  ts_matrix,
  type = "partitional",
  k = 5,  # tune this
  distance = "sbd",
  centroid = "shape",  # required with SBD
  control = partitional_control(iter.max = 50),
  seed = 42
)

# Per-series cluster labels and distance to the assigned centroid.
cluster_assignments <- clusters@cluster
distances <- clusters@cldist

# One row per actor: name, assigned cluster, and centroid distance.
cluster_df <- data.frame(
  name = rownames(ts_matrix),
  cluster = cluster_assignments,
  distance = distances
)

# Attach the cluster assignment back onto the long trends data.
actor_with_cluster <- left_join(trends, cluster_df, by = "name")

# Mean trend per month within each cluster. `.groups = "drop"` removes the
# "`summarise()` has grouped output" message the old code emitted.
actor_with_cluster %>%
  group_by(Month, cluster) %>%
  summarise(value = mean(value), .groups = "drop") %>%
  ggplot(aes(x = Month, y = value, colour = factor(cluster))) +
  geom_line(show.legend = FALSE) +
  scale_y_continuous(labels = scales::comma) +
  facet_wrap(vars(cluster)) +  # Separate plots for each cluster
  scale_color_solarized() +
  theme_minimal() +
  labs(title = "Time Series Clustering", colour = "Cluster")

# Stack the per-cluster centroid series into a matrix: one row per cluster.
centroid_matrix <- do.call(rbind, clusters@centroids)

# Long data frame: one row per (cluster, time point).
# BUG FIX: as.vector() flattens a matrix column-major, but the old
# Time/Cluster index vectors assumed row-major order, so centroid values
# were scrambled across clusters. Transposing first makes the flattening
# order match the index vectors (Time fast, Cluster slow).
centroid_df <- data.frame(Time = rep(1:ncol(centroid_matrix), nrow(centroid_matrix)),
                          Cluster = rep(1:nrow(centroid_matrix), each = ncol(centroid_matrix)),
                          Value = as.vector(t(centroid_matrix)))


# `linewidth` replaces the `size` aesthetic for lines (deprecated in
# ggplot2 3.4.0); this removes the lifecycle warning previously shown.
ggplot(centroid_df, aes(x = Time, y = Value, color = factor(Cluster))) +
  geom_line(linewidth = 1) +
  facet_wrap(~ Cluster, ncol = 1) +
  labs(title = "Cluster Centroids", x = "Time", y = "Value", color = "Cluster") +
  theme_minimal()

# The five series closest to each cluster centroid (smallest SBD distance):
# the most representative actors per cluster.
actor_with_cluster %>% distinct(name, cluster, distance) %>%
  group_by(cluster) %>%
  slice_min(order_by = distance, n = 5)
## # A tibble: 25 × 3
## # Groups:   cluster [5]
##    name                   cluster distance
##    <chr>                    <int>    <dbl>
##  1 Chadwick Boseman             1   0.0156
##  2 Philip Seymour Hoffman       1   0.0166
##  3 Shi Pengyuan                 1   0.0182
##  4 William T. Hurtz             1   0.0182
##  5 Carrie Fisher                1   0.0198
##  6 Joshua David Neal            2   0.0488
##  7 Bérénice Bejo                2   0.0602
##  8 Calah Lane                   2   0.0736
##  9 Mickey Rourke                2   0.0799
## 10 Rooney Mara                  2   0.0817
## # ℹ 15 more rows
# Share of each cluster's actors that received an Oscar nomination.
# The old version computed n()/total inside summarise(), which returned one
# row per group member (deprecated since dplyr 1.1.0 — use reframe() or a
# proper aggregate). A single grouped summarise() yields the same
# proportions directly: NA nominees are excluded from the numerator but
# still counted in the denominator, matching the old n()/total behavior.
# The explicit `by = "name"` also silences the join message.
actor_with_cluster %>%
  distinct(name, cluster) %>%
  mutate(name = tolower(name)) %>%
  left_join(actors, by = "name") %>%
  filter(!is.na(cluster)) %>%
  group_by(cluster) %>%
  summarise(prop_nominees = sum(nominee == 1, na.rm = TRUE) / n(),
            .groups = "drop")
## Previously observed proportions (unchanged by this rewrite):
##   cluster 1: 0.414, 2: 0.504, 3: 0.169, 4: 0.305, 5: 0.357
# Year window covered by the panel data built below.
years_seq <- 2004:2024

# One row per actor, lower-cased name key for joining onto `actors`.
cluster_df <- cluster_df %>%
  distinct(name, cluster) %>%
  mutate(name = tolower(name))

# Static actor-level features used by the models further down.
features <- actors %>%
  left_join(cluster_df, by = "name") %>%
  select(name, age, gender, american, cluster)

# Explode the bracketed, comma-separated `year_winner` field into one row
# per (actor, winning year).
winners <- actors %>%
  mutate(year_winner = str_remove_all(year_winner, "\\[|\\]")) %>%
  separate_rows(year_winner, sep = ",\\s*") %>%
  mutate(year_winner = as.integer(year_winner)) %>%
  select(name, winner, year_winner)

# Cross join name and years_seq: one row per (actor, year) with a
# `won_previously` flag that turns on the year AFTER a win and stays on.
# The old version ran three separate group_by/ungroup passes plus a
# redundant ifelse(cond, TRUE, FALSE); one grouped mutate is equivalent.
winners_filled <- winners %>%
  # Create a data frame of all names and years
  distinct(name) %>%
  expand(name, year_winner = years_seq) %>%
  left_join(winners, by = c("name", "year_winner")) %>%
  rename(year = year_winner) %>%
  group_by(name) %>%
  mutate(
    won = ifelse(is.na(winner), "no", winner),
    # lag() is FALSE in each actor's first year; cummax() carries the flag
    # forward once it switches on
    won_previously = cummax(as.integer(lag(won != "no", default = FALSE)))
  ) %>%
  ungroup() %>%
  select(-won)

# Explode `nominated_years` into one row per (actor, nomination year) and
# flag each row as a nomination.
nominees <- actors %>%
  mutate(nominated_years = str_remove_all(nominated_years, "\\[|\\]")) %>%
  separate_rows(nominated_years, sep = ",\\s*") %>%
  mutate(nominated_years = as.integer(nominated_years)) %>%
  select(name, nominated_years) %>%
  mutate(nominee = ifelse(!is.na(nominated_years), "yes", "no"))

# One row per (actor, year) with a `nominated_previously` flag that turns on
# the year AFTER a nomination and stays on. Same simplification as for
# winners_filled: the three group_by/ungroup passes collapse to one
# grouped mutate.
nominees_filled <- nominees %>%
  # Create a data frame of all names and years
  distinct(name) %>%
  expand(name, nominated_years = years_seq) %>%
  left_join(nominees, by = c("name", "nominated_years")) %>%
  rename(year = nominated_years) %>%
  group_by(name) %>%
  mutate(
    nominated = ifelse(is.na(nominee), "no", nominee),  # no NAs in 'nominated'
    # lag() is FALSE in each actor's first year; cummax() carries the flag
    # forward once it switches on
    nominated_previously = cummax(as.integer(lag(nominated == "yes", default = FALSE)))
  ) %>%
  ungroup() %>%
  select(-nominated)

# Attach actor-level features to both panels.
# NOTE(review): `age = year - age` implies the `age` column actually stores a
# birth year, not an age — confirm upstream; if it already holds an age this
# subtraction is wrong.
winners_filled <- winners_filled %>%
  left_join(features, by = "name") %>%
  mutate(age = year - age)
# Per-(name, year) previous-win flag, for joining onto the nominee panel.
prev_win <- winners_filled %>%
  select(name, year, won_previously)
nominees_filled <- nominees_filled %>%
  left_join(features, by = "name") %>%
  left_join(prev_win, by = c("name", "year")) %>%
  mutate(age = year - age)

# Step 1: one monthly time series per (actor, year), plus its variance
# (used below to drop flat series that break feature extraction).
trend_ts_data_all <- trends %>%
  mutate(year = year(Month),
         name = tolower(name)) %>%
  group_by(name, year) %>%
  arrange(Month) %>%
  summarise(
    ts_data = list(ts(value, frequency = 12)),
    var = var(value, na.rm = TRUE),
    .groups = "drop"
  )

# Step 2: keep a log of the zero-variance (or all-NA) series we exclude
zero_variance_log <- trend_ts_data_all %>%
  filter(is.na(var) | var == 0)

# Step 3: keep only valid series from complete years
trend_ts_data <- trend_ts_data_all %>%
  filter(!is.na(var), var > 0, year != 2025) %>%
  select(-var)

# Feature extraction + custom max spike height.
# tsfeatures() computes a standard battery of time-series features for each
# (actor, year) series; max_spike_height records the series' peak value.
# NOTE(review): the errors/warnings recorded below show some series still
# trip ar.burg ("zero-variance series") and STL ("Insufficient data")
# despite the variance filter above — presumably near-constant or short
# series; consider tightening the Step 2/3 filter upstream.
trend_features_ts <- trend_ts_data %>%
  mutate(
    features = map(ts_data, ~ tsfeatures(.x)),  # Compute all available features
    max_spike_height = map_dbl(ts_data, ~ max(.x, na.rm = TRUE))
  ) %>%
  unnest(features)
## Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action,  : 
##   zero-variance series
## Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action,  : 
##   zero-variance series
## Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action,  : 
##   zero-variance series
## Error in if (order) coefs[order, 1L:order] else numeric() : 
##   argument is not interpretable as logical
## Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action,  : 
##   zero-variance series
## Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action,  : 
##   zero-variance series
## Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action,  : 
##   zero-variance series
## Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action,  : 
##   zero-variance series
## Warning: There were 12112 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `features = map(ts_data, ~tsfeatures(.x))`.
## Caused by warning in `.f()`:
## ! Insufficient data to compute STL decomposition
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 12111 remaining warnings.
# --- 4. Merge with Oscar outcomes ---
# Join the nomination panel onto the per-(actor, year) features and drop
# feature columns that carry no information for these series.
model_data <- trend_features_ts %>%
  left_join(nominees_filled, by = c("name", "year")) %>%
  mutate(
    nominee = factor(ifelse(is.na(nominee), 0, 1))  # factor for classification
  ) %>%
  select(-c(frequency, nperiods, seasonal_period, diff2_acf10, seas_acf1))

# --- 5. Train/test split (stratified so both classes appear in each part) ---
set.seed(123)
data_split <- initial_split(model_data, prop = 0.8, strata = nominee)
train_data <- training(data_split)
test_data <- testing(data_split)

# Drop identifiers and the list column before modeling; remove rows with NAs.
train_data <- train_data %>% select(-c(name, ts_data, year))
train_data_clean <- na.omit(train_data)

## undersampling: balance the classes by downsampling the majority class
## (non-nominee) to the size of the minority class (nominee)

# Split the data into the two classes
non_nominee <- train_data_clean %>% filter(nominee == 0)
nominee <- train_data_clean %>% filter(nominee == 1)

if (nrow(non_nominee) >= nrow(nominee)) {
  set.seed(42)  # reproducible sample
  non_nominee_undersampled <- non_nominee[sample(nrow(non_nominee), nrow(nominee)), ]

  # Combine and shuffle. BUG FIX: the old code rebuilt balanced_data from the
  # unshuffled pieces AFTER shuffling, silently discarding the shuffle — it
  # is now built exactly once.
  balanced_data <- bind_rows(non_nominee_undersampled, nominee) %>%
    sample_frac(1)
} else {
  stop("Not enough non-nominee rows to sample from.")
}

# Check the new balance of classes
table(balanced_data$nominee)
## 
##   0   1 
## 298 298

# Categorical predictors as factors, as randomForest expects
balanced_data <- balanced_data %>%
  mutate(across(
    c(cluster, gender, american, won_previously, nominated_previously),
    as.factor
  ))

# Two feature sets: all predictors, and time-series features only.
balanced_data_1 <- balanced_data
balanced_data_2 <- balanced_data %>%
  select(-c(age, gender, american, nominated_previously, won_previously))

set.seed(1234)

# modeling all predictors: stratified 80/20 train/test split
data_split <- initial_split(balanced_data, prop = 0.8, strata = nominee)
train_data <- training(data_split)
test_data <- testing(data_split)

# Random forest on ALL predictors (time-series features + actor features).
rf_model <- randomForest(nominee ~ ., data = train_data)

# Print model summary
print(rf_model)
## 
## Call:
##  randomForest(formula = nominee ~ ., data = train_data) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 28.78%
## Confusion matrix:
##     0   1 class.error
## 0 161  77   0.3235294
## 1  60 178   0.2521008
# Evaluate on the held-out test set.
predictions <- predict(rf_model, newdata = test_data)

# Confusion Matrix
conf_matrix <- table(Predicted = predictions, Actual = test_data$nominee)

print(conf_matrix)
##          Actual
## Predicted  0  1
##         0 45 20
##         1 15 40
# Accuracy
accuracy <- sum(predictions == test_data$nominee) / nrow(test_data)
print(paste("Accuracy: ", accuracy))
## [1] "Accuracy:  0.708333333333333"
# You can use the caret package for more metrics like precision, recall, and F1 score
conf_matrix_caret <- confusionMatrix(predictions, test_data$nominee)

# Print evaluation metrics
print(conf_matrix_caret)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 45 20
##          1 15 40
##                                           
##                Accuracy : 0.7083          
##                  95% CI : (0.6184, 0.7877)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 2.866e-06       
##                                           
##                   Kappa : 0.4167          
##                                           
##  Mcnemar's Test P-Value : 0.499           
##                                           
##             Sensitivity : 0.7500          
##             Specificity : 0.6667          
##          Pos Pred Value : 0.6923          
##          Neg Pred Value : 0.7273          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3750          
##    Detection Prevalence : 0.5417          
##       Balanced Accuracy : 0.7083          
##                                           
##        'Positive' Class : 0               
## 
# Variable importance: which predictors drive the splits (Gini decrease).
importance(rf_model)
##                      MeanDecreaseGini
## trend                       14.105240
## spike                       14.384914
## linearity                   46.933497
## curvature                   20.636132
## e_acf1                      12.100508
## e_acf10                     11.590256
## entropy                      9.266528
## x_acf1                      13.594978
## x_acf10                     11.704599
## diff1_acf1                  13.922044
## diff1_acf10                 11.871913
## diff2_acf1                  15.846169
## max_spike_height            11.997186
## nominated_previously         3.255137
## age                         14.057697
## gender                       1.465645
## american                     1.788679
## cluster                      7.852584
## won_previously               1.127954
varImpPlot(rf_model)

# modeling just time series predictors (actor features removed) — note this
# re-binds rf_model/train_data/test_data from the run above.
data_split <- initial_split(balanced_data_2, prop = 0.8, strata = nominee)
train_data <- training(data_split)
test_data <- testing(data_split)

rf_model <- randomForest(nominee ~ ., data = train_data)

# Print model summary
print(rf_model)
## 
## Call:
##  randomForest(formula = nominee ~ ., data = train_data) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 30.46%
## Confusion matrix:
##     0   1 class.error
## 0 161  77   0.3235294
## 1  68 170   0.2857143
# Evaluate on the held-out test set.
predictions <- predict(rf_model, newdata = test_data)

# Confusion Matrix
conf_matrix <- table(Predicted = predictions, Actual = test_data$nominee)

print(conf_matrix)
##          Actual
## Predicted  0  1
##         0 36 17
##         1 24 43
# Accuracy
accuracy <- sum(predictions == test_data$nominee) / nrow(test_data)
print(paste("Accuracy: ", accuracy))
## [1] "Accuracy:  0.658333333333333"
# You can use the caret package for more metrics like precision, recall, and F1 score
conf_matrix_caret <- confusionMatrix(predictions, test_data$nominee)

# Print evaluation metrics
print(conf_matrix_caret)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 36 17
##          1 24 43
##                                           
##                Accuracy : 0.6583          
##                  95% CI : (0.5662, 0.7424)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.0003334       
##                                           
##                   Kappa : 0.3167          
##                                           
##  Mcnemar's Test P-Value : 0.3487367       
##                                           
##             Sensitivity : 0.6000          
##             Specificity : 0.7167          
##          Pos Pred Value : 0.6792          
##          Neg Pred Value : 0.6418          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3000          
##    Detection Prevalence : 0.4417          
##       Balanced Accuracy : 0.6583          
##                                           
##        'Positive' Class : 0               
## 
# Variable importance for the time-series-only model (Gini decrease).
importance(rf_model)
##                  MeanDecreaseGini
## trend                    15.81031
## spike                    16.26184
## linearity                50.35193
## curvature                21.73266
## e_acf1                   13.97131
## e_acf10                  11.87024
## entropy                  10.72542
## x_acf1                   14.57469
## x_acf10                  12.76247
## diff1_acf1               15.03577
## diff1_acf10              13.60832
## diff2_acf1               16.75506
## max_spike_height         13.76074
## cluster                  10.26029
varImpPlot(rf_model)